Take Home Ex 3

Author

Aruiana

Published

February 5, 2023

Modified

February 15, 2023

1. The Task

To uncover the salient patterns of the resale prices of public housing property by residential towns and estates in Singapore by using appropriate analytical visualisation techniques l

For the purpose of this study, the focus in on 3-ROOM, 4-ROOM and 5-ROOM types in 2022.

2. Data Preparation

##Step 1: Load Packages

Show
pacman::p_load(tidyverse, plotly, crosstalk, DT, ggdist, gganimate, ggstatsplot, heatmaply)

##Step 2: Import Data

Show
#import data
HDB <- read_csv(("data/HDB.csv"))

##Step3: Filter Data for the study

Filter out the data required: 1. Room Type 2. Year 2022

Show
#Filter 3Room, 4Room, 5Room, Filter 2022, Convert remaining lease into years
HDBRoom <- HDB %>% filter(flat_type=="3 ROOM" | flat_type=="4 ROOM" | flat_type=="5 ROOM") %>%
  separate(month, into = c("year", "month")) %>% 
  filter(year == "2022") %>%
  separate(remaining_lease, into = c("rmlease_years", "rmlease_month"), sep = "years") 

##Step 4: Amend Data Set 1. Convert the Month from Character to Number 2. Convert Remaining lease from Character to Number 3. Re-categorise towns into regions 4. Sort Storey Range by smallest to largest 5. Create new dataset for price/sqm

Show
#Convert Month from Chr to number
HDBRoom$month <- as.numeric(HDBRoom$month)

#Convert Remaining lease into numeric years in decimal
HDBRoom$rmlease_years <- as.numeric(HDBRoom$rmlease_years)

HDBRoom$rmlease_month <- gsub("[monthsmonth]", " ", HDBRoom$rmlease_month) %>%
  as.numeric(HDBRoom$rmlease_month) / 12 

HDBRoom$rmlease_month[is.na(HDBRoom$rmlease_month)] = 0

HDBRoom$rmlease <- as.numeric(HDBRoom$rmlease_years + HDBRoom$rmlease_month)

#Group Towns into Regions
HDBRoom$region <- case_when(
  HDBRoom$town %in% c("ANG MO KIO", "HOUGANG", "PUNGGOL", "SERANGOON", "SENGKANG") ~ "North-East",
    HDBRoom$town %in% c("BISHAN", "BUKIT MERAH", "BUKIT TIMAH", "CENTRAL AREA", "GEYLANG", "KALLANG/WHAMPOA", "MARINE PARADE", "QUEENSTOWN", "TOA PAYOH") ~ "Central",
    HDBRoom$town %in% c("BEDOK", "PASIR RIS", "TAMPINES") ~ "East",
    HDBRoom$town %in% c("SEMBAWANG", "WOODLANDS", "YISHUN") ~ "North",
    HDBRoom$town %in% c("BUKIT BATOK", "BUKIT PANJANG", "CHOA CHU KANG", "CLEMENTI", "JURONG EAST", "JURONG WEST") ~ "West")

#Edit storey range and sort by smallest
HDBRoom$storey_range <- str_replace(HDBRoom$storey_range, "TO", "-")

sr_sort = c("01 - 03", "04 - 06", "07 - 09", "10 - 12", "13 - 15", "16 - 18", "19 - 21", "22 - 24","25 - 27","28 - 30", "31 - 33", "34 - 36", "37 - 39", "40 - 42", "43 - 45", "49 - 51", "46 - 48")

HDBRoom$storey_range <- factor (HDBRoom$storey_range, levels = sr_sort)

#Create additional data on price per sqm
HDBRoom$price_per_sqm <- (HDBRoom$resale_price / HDBRoom$floor_area_sqm)

##Step 5: Select the relevant columns for analysis

Show
HDBDATA <- HDBRoom [,!names(HDBRoom) %in% c("year", "block", "street_name", "rmlease_years", "rmlease_month", "flat_model")]

#3. Data Analytics

Histogram of dataset

From the following histograms, we have the following findings: 1. The largest number of resale have 94 years left in the lease. This is almost immediately after the HDB MOP of 5 years for a 99year lease. This group should be the newest HDB flats. The next “peak” is at 60 years and this

Show
options(scipen = 999)

p1 <- gghistostats(
  data = HDBDATA, x = "rmlease",
  type = "bayes",
  test.value = 100,
  xlab = "Resale Property remaining lease"
  )

p2 <- gghistostats(
  data = HDBDATA, x = "month",
  type = "bayes",
  test.value = 100,
  xlab = "Month of Purchase"
  )

p3 <- gghistostats(
  data = HDBDATA, x = "resale_price",
  type = "bayes",
  test.value = 100,
  xlab = "Resale Price"
  )

p4 <- gghistostats(
  data = HDBDATA, x = "price_per_sqm",
  type = "bayes",
  test.value = 100,
  xlab = "Resale Price/sqm"
  )

p5 <- ggplot(
  data = HDBDATA, aes(x = town, y=rmlease, colour = flat_type)) + geom_point() + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
    labs( x = "Resale by Town")

p6 <- ggplot(
  data = HDBDATA, aes(x = storey_range, fill = flat_type)) + geom_bar() + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
    labs( x = "Resale by Storey")

(p1 + p2) / (p3 + p4)

Show
p5

Show
p6

Show
ggbetweenstats(
  data = HDBDATA,
  x = flat_type, 
  y = price_per_sqm,
  type = "np",
  messages = FALSE
)

Show
ggscatterstats(
  data = HDBDATA,
  x = resale_price,
  y = price_per_sqm,
  marginal = FALSE,
  )

Show
scdata <- highlight_key(HDBDATA) 
  
sc1 <- ggplot(data = scdata, aes(x = town, y = resale_price, colour = region)) + geom_point() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + scale_y_continuous(breaks = c(200000,500000,1000000,150000)) +
  labs(title = "Resale Price by Town", x = "Town", y = "Resale Price")

sc2 <- ggplot(data = scdata, aes(x = town, y = price_per_sqm, colour = region)) + geom_point() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + scale_y_continuous(breaks = c(3000,6000,9000,12000,15000)) +
    labs(title = "Resale Price per sqm by Town", x = "Town", y = "Resale Price/Sqm")

subplot(ggplotly(sc1), ggplotly(sc2))
Show
HDBDATA %>%
  mutate(class = fct_reorder(town, price_per_sqm, .fun="mean")) %>%
  ggplot(aes(y =reorder(town, price_per_sqm),
           x = price_per_sqm, fill = region)) + 
  geom_boxplot() + stat_summary(fun.y=mean, geom = "point", colour="yellow")

Show
HDBDATA %>% 
  group_by(region) %>%
  mutate(class = fct_reorder(region, price_per_sqm, .fun="mean")) %>%
  ggplot(mapping = aes(y = flat_type, x = price_per_sqm)) +
  # Make grouped boxplot
  geom_boxplot(aes(fill = as.factor(region))) +
  theme(legend.position = "top") +
  # Adjust lables and add title
  labs(title = "HDB resale prices in 2022 by region", y="Flat Type", x = "Price per square metre (SGD)", fill = "flat_type")

Show
HDBDATA %>%
  
grouped_gghistostats(
  x                 = resale_price,
  test.value        = 50,
  type              = "nonparametric",
  grouping.var      = region,
  normal.curve      = TRUE,
  normal.curve.args = list(color = "red", size = 1),
  ggtheme           = ggthemes::theme_tufte(),
  ## modify the defaults from `{ggstatsplot}` for each plot
  plotgrid.args     = list(nrow = 2),
  annotation.args   = list(title = "Resale price by region")
)

Show
floorheatmap <-
  HDBDATA %>%
  group_by(town, storey_range) %>%
  summarise(median_price = median(price_per_sqm))

heatmap <- ggplot(data = floorheatmap, 
                  mapping = aes(x = town, y = storey_range, fill = median_price)) +
            geom_tile() +
  labs(title = "Heatmap of HDB breakdown by area and storey", x = "Town", y = "Storey") +
  scale_fill_gradient(name = "Median Resale Price/sqm",
                      low = "peachpuff",
                      high = "deeppink4")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

heatmap

Show
a <-
ggplot(HDBDATA, aes(x = rmlease, y = resale_price, 
                      size = floor_area_sqm, 
                      colour = region)) +
  geom_point(alpha = 0.7, 
             show.legend = FALSE) +
  scale_size(range = c(2, 12)) +
  labs(title = '2022: {as.integer(frame_time)} Month', 
       x = 'Remaining Lease', 
       y = 'Resale Price') +
  transition_time(month) +       #<<
  ease_aes('linear')            #<<

a